Replication of "FRED-MD: A Monthly Database for Macroeconomic Research"¶

Advanced Econometrics Final Assignment
Academic Year 2024/2025
Tommaso de Martino, 1964326

Libraries¶

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import certifi
from scipy import linalg
from numpy.linalg import svd
import matplotlib.pyplot as plt
from IPython.display import display, HTML, Image
import re
import statsmodels.api as sm
from itertools import product
import time

Data¶

Collecting Data¶

The dataset used in this project was originally downloaded from the FRED-MD Monthly Databases for Macroeconomic Research, available on the Federal Reserve Bank of St. Louis website here.

To ensure reproducibility and consistent access to the data, a copy of the CSV file has been uploaded to a public GitHub repository.

The corresponding file on the FRED-MD website is named 2025-05, reflecting the dataset version used in this project.

In [2]:
URL = ('https://raw.githubusercontent.com/tdistudent27/fred-md-data/refs/heads/main/2025-05.csv')
In [3]:
response = requests.get(URL, timeout=30, verify=certifi.where())
response.raise_for_status()          # lancia eccezione se HTTP≠200

df = pd.read_csv(io.BytesIO(response.content)) 
In [4]:
print(df)
        sasdate        RPI  W875RX1  DPCERA3M086SBEA     CMRMTSPLx  \
0    Transform:      5.000      5.0            5.000  5.000000e+00   
1      1/1/1959   2583.560   2426.0           15.188  2.766768e+05   
2      2/1/1959   2593.596   2434.8           15.346  2.787140e+05   
3      3/1/1959   2610.396   2452.7           15.491  2.777753e+05   
4      4/1/1959   2627.446   2470.0           15.435  2.833627e+05   
..          ...        ...      ...              ...           ...   
791   11/1/2024  20091.169  16376.8          122.396  1.545040e+06   
792   12/1/2024  20101.629  16387.7          123.077  1.558008e+06   
793    1/1/2025  20148.969  16391.2          122.614  1.543178e+06   
794    2/1/2025  20209.351  16389.5          122.742  1.556553e+06   
795    3/1/2025  20311.260  16500.4          123.601           NaN   

          RETAILx    INDPRO   IPFPNSS   IPFINAL   IPCONGD  ...  \
0         5.00000    5.0000    5.0000    5.0000    5.0000  ...   
1     17689.23968   21.9616   23.3868   22.2620   31.6664  ...   
2     17819.01912   22.3917   23.7024   22.4549   31.8987  ...   
3     17967.91336   22.7142   23.8459   22.5651   31.8987  ...   
4     17978.97983   23.1981   24.1903   22.8957   32.4019  ...   
..            ...       ...       ...       ...       ...  ...   
791  712145.00000  101.9619   99.3808   98.8609  100.8691  ...   
792  717662.00000  103.1177  100.4976   99.9719  101.6868  ...   
793  711461.00000  103.3418  101.0766  100.6319  102.1879  ...   
794  711680.00000  104.2202  101.8233  101.4377  102.7245  ...   
795  722025.00000  103.8892  101.6665  101.1465  101.7332  ...   

     DNDGRG3M086SBEA  DSERRG3M086SBEA  CES0600000008  CES2000000008  \
0              6.000            6.000           6.00           6.00   
1             18.294           10.152           2.13           2.45   
2             18.302           10.167           2.14           2.46   
3             18.289           10.185           2.15           2.45   
4             18.300           10.221           2.16           2.47   
..               ...              ...            ...            ...   
791          119.230          129.380          31.59          36.26   
792          119.746          129.875          31.72          36.43   
793          120.457          130.281          31.91          36.56   
794          120.615          130.990          32.00          36.66   
795          119.760          131.192          32.21          36.79   

     CES3000000008  UMCSENTx  DTCOLNVHFNM   DTCTHFNM     INVEST  VIXCLSx  
0             6.00       2.0         6.00       6.00     6.0000   1.0000  
1             2.04       NaN      6476.00   12298.00    84.2043      NaN  
2             2.05       NaN      6476.00   12298.00    83.5280      NaN  
3             2.07       NaN      6508.00   12349.00    81.6405      NaN  
4             2.08       NaN      6620.00   12484.00    81.8099      NaN  
..             ...       ...          ...        ...        ...      ...  
791          28.22      71.8    556011.41  938335.20  5381.4576  15.9822  
792          28.33      74.0    559364.75  943484.76  5366.6686  15.6997  
793          28.58      71.7    559087.09  944167.06  5350.2541  16.8122  
794          28.68      64.7    556142.06  941199.49  5367.9408  17.0705  
795          28.92      57.0          NaN        NaN  5406.5887  21.6579  

[796 rows x 127 columns]

Extracting TCODE variables¶

In [5]:
# Extract the first row, which contains the transformation codes
tcode_row = df.iloc[0]

# Drop the 'sasdate' column since it's not an economic variable
tcode_row = tcode_row.drop('sasdate')

# Create a one-row DataFrame with the variable names as columns
# and the index labeled as 'Transform'
tcode_df = pd.DataFrame([tcode_row.values], columns=tcode_row.index, index=['TCODE'])
In [6]:
# Extract the first row, which contains the transformation codes
tcode_df = df.iloc[0]

# Drop the 'sasdate' column since it's not an economic variable
tcode_df = tcode_df.drop('sasdate')

# Create a one-row DataFrame with the variable names as columns
# and the index labeled as 'Transform'
tcode_df = pd.DataFrame([tcode_df.values], columns=tcode_df.index, index=['TCODE'])
In [7]:
print(tcode_df)
       RPI  W875RX1  DPCERA3M086SBEA  CMRMTSPLx  RETAILx  INDPRO  IPFPNSS  \
TCODE  5.0      5.0              5.0        5.0      5.0     5.0      5.0   

       IPFINAL  IPCONGD  IPDCONGD  ...  DNDGRG3M086SBEA  DSERRG3M086SBEA  \
TCODE      5.0      5.0       5.0  ...              6.0              6.0   

       CES0600000008  CES2000000008  CES3000000008  UMCSENTx  DTCOLNVHFNM  \
TCODE            6.0            6.0            6.0       2.0          6.0   

       DTCTHFNM  INVEST  VIXCLSx  
TCODE       6.0     6.0      1.0  

[1 rows x 126 columns]

Adjusting the dataset¶

In [8]:
# Drop the first row (index 0) which contains the transformation codes
df = df.iloc[1:].reset_index(drop=True)

# Rename 'sasdate' to 'Date'
df = df.rename(columns={'sasdate': 'Date'})

# Convert 'Date' to datetime in ISO format (YYYY-MM-DD)
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

# Convert all columns except 'Date' to numeric values
df[df.columns.difference(['Date'])] = df[df.columns.difference(['Date'])].apply(pd.to_numeric, errors='coerce')
In [9]:
# Check
print(df)
          Date        RPI  W875RX1  DPCERA3M086SBEA     CMRMTSPLx  \
0   1959-01-01   2583.560   2426.0           15.188  2.766768e+05   
1   1959-02-01   2593.596   2434.8           15.346  2.787140e+05   
2   1959-03-01   2610.396   2452.7           15.491  2.777753e+05   
3   1959-04-01   2627.446   2470.0           15.435  2.833627e+05   
4   1959-05-01   2642.720   2486.4           15.622  2.853072e+05   
..         ...        ...      ...              ...           ...   
790 2024-11-01  20091.169  16376.8          122.396  1.545040e+06   
791 2024-12-01  20101.629  16387.7          123.077  1.558008e+06   
792 2025-01-01  20148.969  16391.2          122.614  1.543178e+06   
793 2025-02-01  20209.351  16389.5          122.742  1.556553e+06   
794 2025-03-01  20311.260  16500.4          123.601           NaN   

          RETAILx    INDPRO   IPFPNSS   IPFINAL   IPCONGD  ...  \
0     17689.23968   21.9616   23.3868   22.2620   31.6664  ...   
1     17819.01912   22.3917   23.7024   22.4549   31.8987  ...   
2     17967.91336   22.7142   23.8459   22.5651   31.8987  ...   
3     17978.97983   23.1981   24.1903   22.8957   32.4019  ...   
4     18119.82573   23.5476   24.3911   23.1161   32.5567  ...   
..            ...       ...       ...       ...       ...  ...   
790  712145.00000  101.9619   99.3808   98.8609  100.8691  ...   
791  717662.00000  103.1177  100.4976   99.9719  101.6868  ...   
792  711461.00000  103.3418  101.0766  100.6319  102.1879  ...   
793  711680.00000  104.2202  101.8233  101.4377  102.7245  ...   
794  722025.00000  103.8892  101.6665  101.1465  101.7332  ...   

     DNDGRG3M086SBEA  DSERRG3M086SBEA  CES0600000008  CES2000000008  \
0             18.294           10.152           2.13           2.45   
1             18.302           10.167           2.14           2.46   
2             18.289           10.185           2.15           2.45   
3             18.300           10.221           2.16           2.47   
4             18.280           10.238           2.17           2.48   
..               ...              ...            ...            ...   
790          119.230          129.380          31.59          36.26   
791          119.746          129.875          31.72          36.43   
792          120.457          130.281          31.91          36.56   
793          120.615          130.990          32.00          36.66   
794          119.760          131.192          32.21          36.79   

     CES3000000008  UMCSENTx  DTCOLNVHFNM   DTCTHFNM     INVEST  VIXCLSx  
0             2.04       NaN      6476.00   12298.00    84.2043      NaN  
1             2.05       NaN      6476.00   12298.00    83.5280      NaN  
2             2.07       NaN      6508.00   12349.00    81.6405      NaN  
3             2.08       NaN      6620.00   12484.00    81.8099      NaN  
4             2.08      95.3      6753.00   12646.00    80.7315      NaN  
..             ...       ...          ...        ...        ...      ...  
790          28.22      71.8    556011.41  938335.20  5381.4576  15.9822  
791          28.33      74.0    559364.75  943484.76  5366.6686  15.6997  
792          28.58      71.7    559087.09  944167.06  5350.2541  16.8122  
793          28.68      64.7    556142.06  941199.49  5367.9408  17.0705  
794          28.92      57.0          NaN        NaN  5406.5887  21.6579  

[795 rows x 127 columns]

Applying the correct TCODE to each variable¶

To apply the appropriate transformation to each variable, I built a function that reads the transformation code (TCODE) associated with each series and applies the corresponding operation.

These codes are provided in the first row of the original dataset and were stored separately for clarity.

The function loops through all columns (except the Date), looks up the TCODE for each one, and applies the correct transformation using a dictionary that maps each code to a specific operation. This approach ensures that each variable is treated consistently and according to the definitions used in the FRED-MD dataset.

The column TCODE denotes the following data transformation for a series $x$:

  1. $\text{no transformation}$;

  2. $\Delta x_t$;

  3. $\Delta^2 x_t$;

  4. $log(x_t)$;

  5. $\Delta log(x_t)$;

  6. $\Delta^2 log(x_t)$;

  7. $\Delta(x_t / x_{t-1} - 1.0)$.

In [10]:
tcode_functions = {
    1: lambda x: x,                               # (1) Level
    2: lambda x: x.diff(),                        # (2) Δx
    3: lambda x: x.diff().diff(),                 # (3) Δ²x
    4: lambda x: np.log(x),                       # (4) log(x)
    5: lambda x: np.log(x).diff(),                # (5) Δlog(x)
    6: lambda x: np.log(x).diff().diff(),         # (6) Δ²log(x)
    7: lambda x: (x/x.shift(1)-1).diff()          # (7) Δ(x/x₋₁ - 1)
}
In [11]:
def apply_transformations(df, tcode_df):
    df_transformed = df.copy()
    for col in df.columns:
        if col == 'Date':
            continue
        tcode = int(tcode_df[col])
        func = tcode_functions.get(tcode)
        if func:
            df_transformed[col] = func(df[col])
        else:
            print(f"Warning: No transformation defined for TCODE {tcode} in column {col}")
    return df_transformed
In [12]:
# Apply the transformations
df_transformed = apply_transformations(df, tcode_df.loc['TCODE'])
# Now remove the first 2 rows
df_transformed = df_transformed.iloc[2:].reset_index(drop=True)

Another important thing to do is check the -inf with NaN, since they may be the result of log(0).

In [13]:
# Count +inf values per column
pos_inf_count = (df_transformed == np.inf).sum()

# Count -inf values per column
neg_inf_count = (df_transformed == -np.inf).sum()

# Display only columns where +inf or -inf values are present
print("Columns with +inf values:\n", pos_inf_count[pos_inf_count > 0])
print("\nColumns with -inf values:\n", neg_inf_count[neg_inf_count > 0])
Columns with +inf values:
 Series([], dtype: int64)

Columns with -inf values:
 Series([], dtype: int64)

So there are no inf values

In [14]:
df_transformed.set_index('Date', inplace=True)
print(df_transformed)
                 RPI   W875RX1  DPCERA3M086SBEA  CMRMTSPLx   RETAILx  \
Date                                                                   
1959-03-01  0.006457  0.007325         0.009404  -0.003374  0.008321   
1959-04-01  0.006510  0.007029        -0.003622   0.019915  0.000616   
1959-05-01  0.005796  0.006618         0.012043   0.006839  0.007803   
1959-06-01  0.003068  0.003012         0.003642  -0.000097  0.009064   
1959-07-01 -0.000580 -0.000762        -0.003386   0.012155 -0.000330   
...              ...       ...              ...        ...       ...   
2024-11-01  0.001798  0.002507         0.004463   0.004270  0.006384   
2024-12-01  0.000520  0.000665         0.005548   0.008358  0.007717   
2025-01-01  0.002352  0.000214        -0.003769  -0.009564 -0.008678   
2025-02-01  0.002992 -0.000104         0.001043   0.008630  0.000308   
2025-03-01  0.005030  0.006744         0.006974        NaN  0.014431   

              INDPRO   IPFPNSS   IPFINAL   IPCONGD  IPDCONGD  ...  \
Date                                                          ...   
1959-03-01  0.014300  0.006036  0.004896  0.000000  0.019397  ...   
1959-04-01  0.021080  0.014339  0.014545  0.015652  0.006379  ...   
1959-05-01  0.014954  0.008267  0.009580  0.004766  0.020152  ...   
1959-06-01  0.001137  0.007035  0.007125 -0.004766  0.007453  ...   
1959-07-01 -0.024237  0.001168  0.008251  0.013056  0.019609  ...   
...              ...       ...       ...       ...       ...  ...   
2024-11-01 -0.002467 -0.002353 -0.002039 -0.006307  0.014193  ...   
2024-12-01  0.011272  0.011175  0.011175  0.008074 -0.015573  ...   
2025-01-01  0.002171  0.005745  0.006580  0.004916 -0.020199  ...   
2025-02-01  0.008464  0.007360  0.007976  0.005237  0.043469  ...   
2025-03-01 -0.003181 -0.001541 -0.002875 -0.009697  0.005102  ...   

            DNDGRG3M086SBEA  DSERRG3M086SBEA  CES0600000008  CES2000000008  \
Date                                                                         
1959-03-01        -0.001148         0.000292      -0.000022      -0.008147   
1959-04-01         0.001312         0.001760      -0.000022       0.012203   
1959-05-01        -0.001695        -0.001867      -0.000021      -0.004090   
1959-06-01         0.003334         0.001946      -0.004619       0.003992   
1959-07-01        -0.001204        -0.000013       0.000000      -0.004040   
...                     ...              ...            ...            ...   
2024-11-01         0.000117        -0.002130      -0.000639      -0.002760   
2024-12-01         0.004218         0.002179       0.002206       0.004953   
2025-01-01         0.001602        -0.000697       0.001865      -0.001115   
2025-02-01        -0.004609         0.002306      -0.003156      -0.000831   
2025-03-01        -0.008425        -0.003886       0.003725       0.000808   

            CES3000000008  UMCSENTx  DTCOLNVHFNM  DTCTHFNM    INVEST  VIXCLSx  
Date                                                                           
1959-03-01       0.004819       NaN     0.004929  0.004138 -0.014792      NaN  
1959-04-01      -0.004890       NaN     0.012134  0.006734  0.024929      NaN  
1959-05-01      -0.004819       NaN     0.002828  0.002020 -0.015342      NaN  
1959-06-01       0.004796       NaN     0.009726  0.009007 -0.012252      NaN  
1959-07-01      -0.004796       NaN    -0.004631 -0.001000  0.029341      NaN  
...                   ...       ...          ...       ...       ...      ...  
2024-11-01       0.003190       1.3    -0.000872 -0.001684 -0.011910  15.9822  
2024-12-01      -0.001439       2.2     0.004047  0.004152  0.002154  15.6997  
2025-01-01       0.004895      -2.3    -0.006509 -0.004750 -0.000311  16.8122  
2025-02-01      -0.005293      -7.0    -0.004785 -0.003871  0.006364  17.0705  
2025-03-01       0.004841      -7.7          NaN       NaN  0.003874  21.6579  

[793 rows x 126 columns]

Outliers¶

Now let's replace the outliers with NaN

An outlier is defined as an observation that deviates from the sample median by more than ten interquartile ranges.

In [15]:
# Medians and interquartile ranges
median = df_transformed.median()
Q1 = df_transformed.quantile(0.25)
Q3 = df_transformed.quantile(0.75)
IQR = Q3 - Q1

# Absolute distance from median
diff = (df_transformed - median).abs()

# Boolean mask for outliers
outlier_mask = diff > (10 * IQR)

# Removing outliers and setting them to NaN
df_transformed[outlier_mask] = np.nan

tot_outliers = outlier_mask.sum().sum()
print(f'Outliers found and set NaN:\n{tot_outliers}')
Outliers found and set NaN:
159

Iterative Expectation-Maximization algorithm¶

Estimation of the static factors by PCA adapted to allow for missing values. This is essentially the EM algorithm given in Stock and Watson (2002).

Observations that are missing are initialized to the unconditional mean based on the non-missing values (which is zero since the data are demeaned and standardized) so that the panel is re-balanced

Now let's check for NaN values

In [16]:
nan_count = df_transformed.isna().sum()
print("NaN values per column:\n", nan_count[nan_count > 0])
NaN values per column:
 RPI                  7
W875RX1              2
DPCERA3M086SBEA      3
CMRMTSPLx            2
RETAILx              2
                  ... 
CUSR0000SAS          1
UMCSENTx           227
DTCOLNVHFNM          9
DTCTHFNM             7
VIXCLSx             40
Length: 72, dtype: int64

In this section, I estimate the latent static factors from the transformed FRED-MD dataset using an Expectation-Maximization Principal Component Analysis (EM-PCA) procedure. This method follows the approach described in McCracken and Ng (2015).

The estimation technique is based on the algorithm proposed by Stock and Watson (2002), which allows principal component analysis to be performed in the presence of NaN data by iteratively imputing missing values and re-estimating factors until convergence.

Computing the mean for each variable¶

In [17]:
# Now let's compute the unconditional mean for each variable excluding the NaN values
nan_mean = df_transformed.mean(skipna=True)
In [18]:
print(nan_mean)
RPI                 0.002520
W875RX1             0.002575
DPCERA3M086SBEA     0.002776
CMRMTSPLx           0.002326
RETAILx             0.004653
                     ...    
UMCSENTx           -0.047173
DTCOLNVHFNM        -0.000008
DTCTHFNM            0.000005
INVEST              0.000019
VIXCLSx            19.293502
Length: 126, dtype: float64

Filling the NaN with the mean¶

In [19]:
df_nan = df_transformed.copy()

# Let's substitute the NaN with the mean of the variable
for col in nan_mean.index:
    df_nan[col] = df_nan[col].fillna(nan_mean[col])

Standardizing the Data¶

  • Standardize each variable using:

    $$ z_{it} = \frac{x_{it} - \mu_i}{\sigma_i} $$

  • This ensures that each variable has mean 0 and standard deviation 1.

In [20]:
# Now compute standard deviation and the mean on the filled version
std = df_nan.std()
mean = df_nan.mean()

# Create a new standardized DataFrame
df_standardized = df_nan.copy()

# Standardize each column (subtract mean and divide by std)
for col in mean.index:
    df_standardized[col] = (df_nan[col] - mean[col]) / std[col]
In [21]:
print(df_standardized)
                 RPI   W875RX1  DPCERA3M086SBEA  CMRMTSPLx   RETAILx  \
Date                                                                   
1959-03-01  0.743417  0.913692         1.177891  -0.490603  0.278756   
1959-04-01  0.753564  0.856719        -1.136802   1.514077 -0.306780   
1959-05-01  0.618746  0.777659         1.646691   0.388474  0.239408   
1959-06-01  0.103557  0.083970         0.153936  -0.208534  0.335227   
1959-07-01 -0.585383 -0.642070        -1.094952   0.846037 -0.378658   
...              ...       ...              ...        ...       ...   
2024-11-01 -0.136278 -0.013218         0.299764   0.167340  0.131565   
2024-12-01 -0.377567 -0.367449         0.492706   0.519265  0.232856   
2025-01-01 -0.031653 -0.454366        -1.162997  -1.023462 -1.013013   
2025-02-01  0.089212 -0.515402        -0.307847   0.542640 -0.330181   
2025-03-01  0.474014  0.801900         0.746027   0.000000  0.743067   

              INDPRO   IPFPNSS   IPFINAL   IPCONGD  IPDCONGD  ...  \
Date                                                          ...   
1959-03-01  1.432224  0.498939  0.314042 -0.168161  0.675422  ...   
1959-04-01  2.229303  1.531037  1.396674  1.458580  0.160896  ...   
1959-05-01  1.509062  0.776197  0.839660  0.327197  0.705275  ...   
1959-06-01 -0.115156  0.623132  0.564228 -0.663519  0.203325  ...   
1959-07-01 -3.098162 -0.106174  0.690570  1.188749  0.683799  ...   
...              ...       ...       ...       ...       ...  ...   
2024-11-01 -0.538955 -0.543769 -0.464048 -0.823675  0.469722  ...   
2024-12-01  1.076242  1.137695  1.018633  0.670981 -0.706824  ...   
2025-01-01  0.006333  0.462745  0.503050  0.342751 -0.889667  ...   
2025-02-01  0.746155  0.663549  0.659610  0.376176  1.626893  ...   
2025-03-01 -0.622839 -0.442876 -0.557812 -1.175995  0.110389  ...   

            DNDGRG3M086SBEA  DSERRG3M086SBEA  CES0600000008  CES2000000008  \
Date                                                                         
1959-03-01        -0.185725         0.183861      -0.006168      -0.922779   
1959-04-01         0.215603         1.106570      -0.006116       1.382487   
1959-05-01        -0.274979        -1.173973      -0.006065      -0.463208   
1959-06-01         0.545536         1.223616      -1.178946       0.452264   
1959-07-01        -0.194877        -0.008207      -0.000597      -0.457593   
...                     ...              ...            ...            ...   
2024-11-01         0.020714        -1.339505      -0.163703      -0.312603   
2024-12-01         0.689762         1.370221       0.562088       0.561171   
2025-01-01         0.262880        -0.438692       0.475253      -0.126262   
2025-02-01        -0.750521         1.450354      -0.805624      -0.094019   
2025-03-01        -1.373094        -2.444361       0.949592       0.091644   

            CES3000000008  UMCSENTx   DTCOLNVHFNM      DTCTHFNM    INVEST  \
Date                                                                        
1959-03-01       1.023820  0.000000  2.605637e-01  3.884093e-01 -1.369033   
1959-04-01      -1.040701  0.000000  6.408395e-01  6.323378e-01  2.302433   
1959-05-01      -1.025764  0.000000  1.496799e-01  1.893801e-01 -1.419858   
1959-06-01       1.019002  0.000000  5.137570e-01  8.458715e-01 -1.134212   
1959-07-01      -1.020848  0.000000 -2.440059e-01 -9.439948e-02  2.710237   
...                   ...       ...           ...           ...       ...   
2024-11-01       0.677391  0.393806 -4.561051e-02 -1.587556e-01 -1.102657   
2024-12-01      -0.306973  0.656894  2.140096e-01  3.896428e-01  0.197339   
2025-01-01       1.040122 -0.658546 -3.431704e-01 -4.468299e-01 -0.030556   
2025-02-01      -1.126504 -2.032450 -2.521521e-01 -3.642178e-01  0.586412   
2025-03-01       1.028436 -2.237074  1.385908e-18  2.387832e-18  0.356265   

             VIXCLSx  
Date                  
1959-03-01  0.000000  
1959-04-01  0.000000  
1959-05-01  0.000000  
1959-06-01  0.000000  
1959-07-01  0.000000  
...              ...  
2024-11-01 -0.481744  
2024-12-01 -0.522844  
2025-01-01 -0.360992  
2025-02-01 -0.323413  
2025-03-01  0.343984  

[793 rows x 126 columns]
In [22]:
nan_count_standardized = df_standardized.isna().sum()
print("NaN values per column:\n", nan_count_standardized[nan_count_standardized > 0])
NaN values per column:
 Series([], dtype: int64)

Factors¶

Starting from the data panel (with missing values initialized to zero), we estimate:

  • A matrix of factors:
    $$ F = (f_1, \dots, f_T)' \in \mathbb{R}^{T \times r} $$

  • A matrix of loadings:
    $$ \lambda = (\lambda_1, \dots, \lambda_N)' \in \mathbb{R}^{N \times r} $$

These are estimated using the normalization: $$ \lambda' \lambda / N = I_r $$

Where:

  • $T$ = number of time periods (rows),
  • $N$ = number of series (columns),
  • $r$ = number of factors.

The missing value for series i at time t is updated from zero to $\lambda'_i f_t$.

This is multiplied by the standard deviation of the series and the mean is re-added.

The resulting value is treated as an observation for series i at time t, and the mean and variance of the complete sample are re-calculated. The data are demeaned and standardized again, and the factors and loadings are re-estimated from the updated panel. The iteration stops when the factor estimates do not change.

$PC_p$ criteria developed in Bai and Ng (2002) is used, which is a generalization of Mallow’s $C_p$ criteria for large dimensional panels.

The penalty used is:

$$ (N+T)/N T \log(\text{min}(N,T)) $$

In [23]:
# 1. PCA with normalization λ'λ / N = I_r

def pca_stock_watson(X_std: np.ndarray, r: int):
    """
    Parameters
    ----------
    X_std : ndarray (T × N) – columns already demeaned and standardized
    r     : number of factors

    Returns
    -------
    F_hat      : (T × r)     – estimated factors
    Lambda_hat : (N × r)     – estimated loadings
    X_hat      : reconstruction F̂ Λ̂' (T × N)
    sing_vals  : singular values of X'X
    """
    T, N = X_std.shape
    U, s, _ = svd(X_std.T @ X_std, full_matrices=False)
    Lambda_hat = np.sqrt(N) * U[:, :r]
    F_hat      = (X_std @ Lambda_hat) / N
    X_hat      = F_hat @ Lambda_hat.T
    return F_hat, Lambda_hat, X_hat, s
In [24]:
# 2. Bai & Ng 2002 Criterion 

def pc_p2_criterion(X_std: np.ndarray, kmax: int = 15) -> int:
    """
    Returns r* ∈ {0, …, kmax} that minimizes the PC_p2 criterion.
    """
    T, N  = X_std.shape
    NT, NT1 = T * N, N + T
    log_pen = np.log(min(N, T))

    crit = np.empty(kmax + 1)
    for r in range(kmax + 1):
        if r == 0:
            X_hat = np.zeros_like(X_std)
        else:
            X_hat = pca_stock_watson(X_std, r)[2]

        sigma2 = np.sum((X_std - X_hat) ** 2) / NT
        pen    = 0 if r == 0 else (NT1 / NT) * log_pen * r
        crit[r] = np.log(sigma2) + pen

    return int(np.argmin(crit))
In [25]:
# 3. EM algorithm 

def em_factors(
    df_transformed:   pd.DataFrame,   # original NaNs
    df_missing:       pd.DataFrame,   # NaNs → unconditional mean (not standardized)
    df_standardized:  pd.DataFrame,   # z-score (demeaned + std), no NaNs
    *,
    kmax:         int   = 15,
    tol:          float = 1e-6,
    max_iter:     int   = 200,
    print_every:  int   = 10,
):
    """
    Python port of the MATLAB function `factors_em` (only DEMEAN=2).
    Returns:
        factors_df, loadings_df, r_star,
        X_filled_unstd_df, X_filled_std_df, X_hat_df
    """

    # setup 
    idx, cols = df_transformed.index, df_transformed.columns
    mask_nan  = df_transformed.isna().to_numpy()          # True on original NaNs

    X_std      = df_standardized.to_numpy(float)          # (T × N)
    X_hat_prev = np.zeros_like(X_std)
    err, it    = np.inf, 0

    #  EM loop 
    while err > tol and it < max_iter:
        it += 1

        # 1. select r* with PC_p2
        r_star = pc_p2_criterion(X_std, kmax)

        # 2. Stock-Watson PCA
        F_hat, Lambda_hat, X_hat, _ = pca_stock_watson(X_std, r_star)

        # 3. convergence criterion
        if it > 1:
            err = np.sum((X_hat - X_hat_prev) ** 2) / np.sum(X_hat_prev ** 2)
        X_hat_prev = X_hat.copy()

        if it == 1 or it % print_every == 0:
            print(f"Iter {it:3d} | r*={r_star:2d} | err={err:9.2e}")

        # 4. BACK-TRANSFORM with CURRENT mean/std 
        mean_curr = X_std.mean(axis=0)
        std_curr  = X_std.std(axis=0, ddof=0)
        std_curr[std_curr == 0] = 1.0

        X_unstd = X_std * std_curr + mean_curr          # original scale
        updates = X_hat * std_curr + mean_curr          # predictions in original scale
        X_unstd[mask_nan] = updates[mask_nan]           # replace only NaNs

        # 5. re-standardize for next iteration 
        mean_next = X_unstd.mean(axis=0)
        std_next  = X_unstd.std(axis=0, ddof=0)
        std_next[std_next == 0] = 1.0
        X_std = (X_unstd - mean_next) / std_next

    #  output 
    factors_df = pd.DataFrame(
        F_hat, index=idx, columns=[f"F{i+1}" for i in range(r_star)]
    )
    loadings_df = pd.DataFrame(
        Lambda_hat, index=cols, columns=[f"F{i+1}" for i in range(r_star)]
    )
    X_filled_unstd_df = pd.DataFrame(X_unstd, index=idx, columns=cols)
    X_filled_std_df   = pd.DataFrame(X_std,   index=idx, columns=cols)
    X_hat_df          = pd.DataFrame(X_hat,   index=idx, columns=cols)

    if it == max_iter:
        print(f" EM: maximum number of iterations reached ({max_iter}), err = {err:.2e}")
    else:
        print(f"Converged in {it} iterations (err = {err:.2e})")

    return (
        factors_df,
        loadings_df,
        r_star,
        X_filled_unstd_df,
        X_filled_std_df,
        X_hat_df,
    )
In [26]:
factors, loadings, r_opt, X_f, X_f_std, xhat = em_factors(
    df_transformed = df_transformed,   # original NaNs
    df_missing     = df_nan,           # NaNs → unconditional mean
    df_standardized= df_standardized,  # z-score
    kmax           = 15,               # same as in the MATLAB script
    tol            = 1e-6,
    max_iter       = 200,
    print_every    = 10                # log every 10 iterations
)
Iter   1 | r*= 7 | err=      inf
Iter  10 | r*= 7 | err= 2.08e-04
Iter  20 | r*= 7 | err= 3.80e-05
Iter  30 | r*= 7 | err= 9.02e-06
Iter  40 | r*= 7 | err= 2.57e-06
Converged in 49 iterations (err = 9.09e-07)
In [27]:
print("Optimal number of factors (r*) =", r_opt, "\n")

# First 5 rows of the factors (T × r)
print("=== FACTORS ===")
print(factors.head())

print("\n=== LOADINGS ===")
print(loadings.head())
Optimal number of factors (r*) = 7 

=== FACTORS ===
                  F1        F2        F3        F4        F5        F6  \
Date                                                                     
1959-03-01 -0.634074 -0.154981  0.012307 -0.109969 -0.135474  0.063457   
1959-04-01 -0.672058 -0.087580 -0.039489  0.025380 -0.150690  0.210907   
1959-05-01 -0.434015 -0.089333  0.030766  0.024450 -0.151005  0.072027   
1959-06-01 -0.181955 -0.056294 -0.147395  0.112033 -0.112040  0.050307   
1959-07-01  0.189512 -0.170787 -0.216749  0.020062 -0.254072 -0.007770   

                  F7  
Date                  
1959-03-01  0.079084  
1959-04-01 -0.067066  
1959-05-01 -0.047864  
1959-06-01  0.011220  
1959-07-01  0.139868  

=== LOADINGS ===
                       F1        F2        F3        F4        F5        F6  \
RPI             -0.804890 -0.522683  0.594732 -0.445156  0.311571 -0.102130   
W875RX1         -1.323651 -0.542498  0.824017 -0.514824  0.399597  0.181471   
DPCERA3M086SBEA -1.318999  0.374147  0.722967 -0.682692  0.372571 -1.507343   
CMRMTSPLx       -1.657474  0.268653  0.864031 -0.507607  0.558627 -0.834430   
RETAILx         -1.366324  0.783677  0.184843 -0.477917  0.653177 -1.274763   

                       F7  
RPI              0.136943  
W875RX1          0.345291  
DPCERA3M086SBEA  0.468549  
CMRMTSPLx        0.250298  
RETAILx          0.634052  
In [28]:
# Number of factors (i.e., columns of the DataFrame)
num_factors = factors.shape[1]
colors = plt.get_cmap('tab10').colors  # palette with at least 10 colors

# Create a figure with as many subplots as rows (i.e., number of factors)
fig, axes = plt.subplots(nrows=num_factors, ncols=1, figsize=(12, 2.5 * num_factors), sharex=False, dpi = 600)

for i, ax in enumerate(axes):
    col = colors[i % len(colors)]
    ax.plot(factors.index, factors.iloc[:, i], color=col)
    ax.set_ylabel(f'F{i+1}')
    ax.grid(False)

axes[-1].set_xlabel('Date')
fig.tight_layout()
fig.suptitle("Factors", fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
No description has been provided for this image

NaN have been filled¶

Also NaN values, now, have been filled thanks to this process

Now let's check if the procedure has succeed, running the line below should show False

In [29]:
X_f.isna().any().any()
Out[29]:
False

And we also have its standardized version X_std

In [30]:
X_f_std.isna().any().any()
Out[30]:
False

Regression, $R^2$, and $mR^2$¶

Cumulative regression for each series

For each series $x_i$ (i.e., each column of the dataset, such as inflation, employment, etc.), do the following:

  1. Regression on $F1$

$$ x_{t,i} = \alpha_i + \beta_{i,1} F_{1,t} + \varepsilon_{i,t} $$

→ Let's save $R_i(1)^{2}$

  1. Regression on $F1 + F2$

$$ x_{t,i} = \alpha_i + \beta_{i,1} F_{1,t} + \beta_{i,2} F_{2,t} + \varepsilon_{i,t} $$

→ Let's save $R_i(2)^{2}$

  1. Regression on $F1 + F2 + F3$

$$ x_{t,i} = \alpha_i + \beta_{i,1} F_{1,t} + \beta_{i,2} F_{2,t} + \beta_{i,t} F_{3,t} + \varepsilon_{i,t} $$

→ Let's save $R_i(3)^{2}$

  1. we continue up to $r$, the optimal number of factors

NOTE:

lstsq solves solves the least squares problem but it is numerically stable and optimized and handles multiple dependent variables in a single call.

It avoids manual matrix inversion.

In [31]:
T, N = X_f_std.shape
r = factors.shape[1]

X_np_std = X_f_std.to_numpy()
F_np = factors.to_numpy()
columns = X_f_std.columns

y_mean = np.mean(X_np_std, axis=0)
SST = np.sum((X_np_std - y_mean)**2, axis=0)

# Initialize DataFrame with index = variable names
R2_matrix = pd.DataFrame(index=columns)

# Loop over k factors
for k in range(1, r + 1):
    F_k = F_np[:, :k]  # first k factors
    beta_hat, *_ = np.linalg.lstsq(F_k, X_np_std, rcond=None)
    y_hat = F_k @ beta_hat
    SSR = np.sum((y_hat - y_mean)**2, axis=0)
    R2 = SSR / SST

    # Add column for cumulative R² with k factors
    R2_matrix[f'R2_F{k}'] = R2

Now we have a matrix R2_matrix containing the $R^2$ (computed as stated before) on the columns and the variables on the rows

In [32]:
print(R2_matrix.head())
                    R2_F1     R2_F2     R2_F3     R2_F4     R2_F5     R2_F6  \
RPI              0.110971  0.131306  0.155288  0.165859  0.170515  0.170852   
W875RX1          0.299966  0.321856  0.367886  0.382022  0.389687  0.390745   
DPCERA3M086SBEA  0.297816  0.308254  0.343691  0.368542  0.375204  0.448385   
CMRMTSPLx        0.470080  0.475465  0.526062  0.539802  0.554758  0.577185   
RETAILx          0.319538  0.365299  0.367627  0.379809  0.400234  0.452580   

                    R2_F7  
RPI              0.171356  
W875RX1          0.393956  
DPCERA3M086SBEA  0.454303  
CMRMTSPLx        0.578868  
RETAILx          0.463431  

Compute marginals for each series

After obtaining all $R_i(k)^{2}$, compute for each $k$ and each series:

  • $mR_i(1)^{2} = R_i(1)^{2}$
  • $mR_i(2)^{2} = R_i(2)^{2} - R_i(1)^{2}$
  • $mR_i(3)^{2} = R_i(3)^{2} - R_i(2)^{2}$
  • ... up to $mR_i(r)^{2}$
In [33]:
mR2_matrix = R2_matrix.copy()

# Marginals R²
mR2_matrix.iloc[:, 1:] = R2_matrix.iloc[:, 1:].values - R2_matrix.iloc[:, :-1].values
In [34]:
print(mR2_matrix.head())
                    R2_F1     R2_F2     R2_F3     R2_F4     R2_F5     R2_F6  \
RPI              0.110971  0.020335  0.023983  0.010571  0.004656  0.000337   
W875RX1          0.299966  0.021890  0.046030  0.014137  0.007664  0.001058   
DPCERA3M086SBEA  0.297816  0.010438  0.035437  0.024850  0.006662  0.073181   
CMRMTSPLx        0.470080  0.005385  0.050597  0.013741  0.014956  0.022427   
RETAILx          0.319538  0.045761  0.002328  0.012182  0.020425  0.052346   

                    R2_F7  
RPI              0.000504  
W875RX1          0.003211  
DPCERA3M086SBEA  0.005918  
CMRMTSPLx        0.001684  
RETAILx          0.010851  

Average per factor

Finally, for each factor $k$, compute:

$$ mR(k)^{2} = \frac{1}{N} \sum_{i=1}^{N} mR_i(k)^{2} $$

In [35]:
# Mean for each factor (i.e., by column)
mR2_average = mR2_matrix.mean(axis=0)

mR2_average.name = 'mean_mR2'
In [36]:
print(mR2_average)
R2_F1    0.171063
R2_F2    0.074508
R2_F3    0.067727
R2_F4    0.053311
R2_F5    0.047765
R2_F6    0.032216
R2_F7    0.027055
Name: mean_mR2, dtype: float64

Series that “load the most” on each factor

For each factor $k$:

  • sort the series by $mR_i(k)^{2}$
  • take the top 10
In [37]:
top10_by_factor = {}

for col in mR2_matrix.columns:
    top10_series = mR2_matrix[col].sort_values(ascending=False).head(10)
    top10_by_factor[col] = top10_series
In [38]:
print("Top 10 series for F1:")
print(top10_by_factor['R2_F1'])
Top 10 series for F1:
IPMANSICS    0.802027
PAYEMS       0.783759
INDPRO       0.761174
IPFPNSS      0.756845
CUMFNS       0.749975
USGOOD       0.742252
MANEMP       0.690483
IPFINAL      0.688075
DMANEMP      0.645203
IPDMAT       0.625748
Name: R2_F1, dtype: float64

Total Variation Explained by the factors

In [39]:
xhat_np = xhat.to_numpy(float)      

# Total Variation Explained
SSE = np.square(X_np_std - xhat_np).sum()   # Resigual sum of squares
SST = np.square(X_np_std).sum()             # Total square sum

TVE = 1 - SSE / SST
print(f"Total Variation Explained: {TVE:.4f}")
Total Variation Explained: 0.4736

Table¶

In [40]:
# Number of factors
r = len(top10_by_factor)

# List of DataFrames (one per factor)
col_blocks = []

for k in range(1, r + 1):
    factor_label = f'F{k}'
    col_label = f'R2_F{k}'
    col_name = f'mR²({k})'

    # Top 10 series and values
    top10 = top10_by_factor[col_label]

    # Create the column for this factor
    col_df = pd.DataFrame({
        (col_name, 'Series'): [''] + list(top10.index),
        (col_name, 'mR²'): [f"{mR2_average[col_label]:.3f}"] + [f"{v:.3f}" for v in top10.values]
    })

    col_blocks.append(col_df)

# Concatenate all columns (axis=1 → side-by-side columns)
table_df = pd.concat(col_blocks, axis=1)

# Assign index: 'mean', '1', ..., '10'
table_df.index = ['mean'] + [str(i) for i in range(1, 11)]
In [41]:
print(f"Total Variation Explained: {TVE:.3f}")
HTML(table_df.to_html(escape=False))
Total Variation Explained: 0.474
Out[41]:
mR²(1) mR²(2) mR²(3) mR²(4) mR²(5) mR²(6) mR²(7)
Series mR² Series mR² Series mR² Series mR² Series mR² Series mR² Series mR²
mean 0.171 0.075 0.068 0.053 0.048 0.032 0.027
1 IPMANSICS 0.802 CUSR0000SAC 0.601 BAAFFM 0.417 GS1 0.618 T1YFFM 0.538 AWHMAN 0.365 S&P 500 0.444
2 PAYEMS 0.784 CUSR0000SA0L2 0.587 AAAFFM 0.414 GS5 0.612 TB6SMFFM 0.517 CES0600000007 0.362 S&P div yield 0.324
3 INDPRO 0.761 DNDGRG3M086SBEA 0.562 T10YFFM 0.377 GS10 0.558 T5YFFM 0.491 UEMP15OV 0.315 S&P PE ratio 0.253
4 IPFPNSS 0.757 CPIAUCSL 0.556 T5YFFM 0.315 TB6MS 0.550 TB3SMFFM 0.456 UEMP27OV 0.173 UMCSENTx 0.177
5 CUMFNS 0.750 CPITRNSL 0.546 HOUST 0.310 AAA 0.526 T10YFFM 0.418 BUSINVx 0.143 VIXCLSx 0.167
6 USGOOD 0.742 CUSR0000SA0L5 0.533 PERMIT 0.260 BAA 0.448 AAAFFM 0.341 S&P PE ratio 0.142 EXCAUSx 0.120
7 MANEMP 0.690 PCEPI 0.520 HOUSTW 0.258 TB3MS 0.447 BAAFFM 0.269 UEMP15T26 0.140 NONBORRES 0.085
8 IPFINAL 0.688 CPIULFSL 0.483 PERMITW 0.257 CP3Mx 0.431 COMPAPFFx 0.247 UEMPMEAN 0.118 TOTRESNS 0.081
9 DMANEMP 0.645 WPSFD49502 0.424 HOUSTS 0.237 TWEXAFEGSMTHx 0.227 PERMIT 0.231 MANEMP 0.088 BOGMBASE 0.079
10 IPDMAT 0.626 WPSFD49207 0.407 TB3SMFFM 0.233 FEDFUNDS 0.173 PERMITW 0.211 USWTRADE 0.087 TB3MS 0.060

Table_ADV_ECON.png

LaTex¶

If you want to produce the tabe shown before you must run thi cell which will generate a LaTex code that you can paste in your LaTex script

NOTE

This code is dynamic in such a way you can just run the cell below with the results obtained from the code

In [42]:
def texify(s: str) -> str:
    """Escape &, _, %, # and preserve spaces."""
    return re.sub(r'([&_#%])', r'\\\1', s)

# Complete DataFrame 
r = len(top10_by_factor) 

headers = []
for k in range(1, r + 1):
    h = rf'\emph{{$mR^2({k})$}}'
    headers.extend([(h, 'Series'), (h, r'$mR^2$')])

cols = pd.MultiIndex.from_tuples(headers)
idx  = ['mean'] + list(map(str, range(1, 11)))
big  = pd.DataFrame('', index=idx, columns=cols)

# “mean” row
for k in range(1, r + 1):
    key = f'R2_F{k}'
    big.loc['mean', (rf'\emph{{$mR^2({k})$}}', r'$mR^2$')] = f'{mR2_average[key]:.3f}'

# top-10 rows
for k in range(1, r + 1):
    key = f'R2_F{k}'
    for i, (name, val) in enumerate(top10_by_factor[key].items(), start=1):
        big.loc[str(i), (rf'\emph{{$mR^2({k})$}}', 'Series')] = texify(name)
        big.loc[str(i), (rf'\emph{{$mR^2({k})$}}', r'$mR^2$')] = f'{val:.3f}'

# split (1-4) / (5-7) 
left   = big.iloc[:, :8]     # 4 factors × 2 columns
right  = big.iloc[:, 8:]     # 3 factors × 2 columns

colfmt_left  = 'l' + (' l r' * 4)
colfmt_right = 'l' + (' l r' * 3)

latex_left = left.to_latex(
    escape=False, multicolumn=True, multicolumn_format='c',
    index=True, column_format=colfmt_left
)
latex_right = right.to_latex(
    escape=False, multicolumn=True, multicolumn_format='c',
    index=True, column_format=colfmt_right
)

with open('factor_marginals_left.tex',  'w') as f:
    f.write(latex_left)
with open('factor_marginals_right.tex', 'w') as f:
    f.write(latex_right)

# macro with the TVE 
TVE_str = f"{TVE:.3f}"                      
with open('TVE_value.tex', 'w') as f:
    f.write(rf'\newcommand{{\TVEValue}}{{{TVE_str}}}')

print("Created: factor_marginals_left.tex, factor_marginals_right.tex, TVE_value.tex")
Created: factor_marginals_left.tex, factor_marginals_right.tex, TVE_value.tex

Figure 1 in the paper¶

In [43]:
group_dict = {
    1: ["RPI","W875RX1","INDPRO","IPFPNSS","IPFINAL","IPCONGD","IPDCONGD",
        "IPNCONGD","IPBUSEQ","IPMAT","IPDMAT","IPNMAT","IPMANSICS","IPB51222s",
        "IPFUELS","NAPMPI","CUMFNS"],
    2: ["HWI","HWIURATIO","CLF16OV","CE16OV","UNRATE","UEMPMEAN","UEMPLT5",
        "UEMP5TO14","UEMP15OV","UEMP15T26","UEMP27OV","CLAIMSx","PAYEMS",
        "USGOOD","CES1021000001","USCONS","MANEMP","DMANEMP","NDMANEMP",
        "SRVPRD","USTPU","USWTRADE","USTRADE","USFIRE","USGOVT",
        "CES0600000007","AWOTMAN","AWHMAN","NAPMEI","CES0600000008",
        "CES2000000008","CES3000000008"],
    3: ["HOUST","HOUSTNE","HOUSTMW","HOUSTS","HOUSTW",
        "PERMIT","PERMITNE","PERMITMW","PERMITS","PERMITW"],
    4: ["DPCERA3M086SBEA","CMRMTSPLx","RETAILx","NAPM","NAPMNOI","NAPMSDI",
        "NAPMII","ACOGNO","AMDMNOx","ANDENOx","AMDMUOx","BUSINVx",
        "ISRATIOx","UMCSENTx"],
    5: ["M1SL","M2SL","M2REAL","AMBSL","TOTRESNS","NONBORRES","BUSLOANS",
        "REALLN","NONREVSL","CONSPI","MZMSL","DTCOLNVHFNM","DTCTHFNM","INVEST"],
    6: ["FEDFUNDS","CP3Mx","TB3MS","TB6MS","GS1","GS5","GS10","AAA","BAA",
        "COMPAPFFx","TB3SMFFM","TB6SMFFM","T1YFFM","T5YFFM","T10YFFM",
        "AAAFFM","BAAFFM","TWEXMMTH","EXSZUSx","EXJPUSx","EXUSUKx","EXCAUSx"],
    7: ["PPIFGS","PPIFCG","PPIITM","PPICRM","OILPRICEx","PPICMM","NAPMPRI",
        "CPIAUCSL","CPIAPPSL","CPITRNSL","CPIMEDSL","CUSR0000SAC","CUUR0000SAD",
        "CUSR0000SAS","CPIULFSL","CUUR0000SA0L2","CUSR0000SA0L5","PCEPI",
        "DDURRG3M086SBEA","DNDGRG3M086SBEA","DSERRG3M086SBEA"],
    8: ["S&P 500","S&P: indust","S&P div yield","S&P PE ratio"],
}
In [44]:
def norm(name: str) -> str:
    """Lowercase, letters/digits only (removes spaces, :, &, etc.)."""
    return re.sub(r'[^a-z0-9]', '', name.lower())


R2_last = R2_matrix.iloc[:, -1]  # R² with r_opt factors
R2_last_norm = R2_last.copy()
R2_last_norm.index = R2_last_norm.index.map(norm)

# Mapping norm_name → original name (for optional labels)
norm_to_original = {norm(v): v for g in group_dict.values() for v in g}

# ---------------------------------------------
# Building ordered lists
# ---------------------------------------------
ordered_vars, group_codes, missing = [], [], []

for g in range(1, 9):  # keep group order 1…8
    for v in group_dict[g]:
        key = norm(v)
        if key in R2_last_norm.index:
            ordered_vars.append(key)
            group_codes.append(g)
        else:
            missing.append(v)  # for debugging only

if missing:
    print("[WARN] Series not found in R2_matrix:")
    print(missing)

R2_ord = R2_last_norm.loc[ordered_vars].to_numpy()
groups_ord = np.array(group_codes)
labels_ord = [norm_to_original[k] for k in ordered_vars]

# ---------------------------------------------
# Palette and colors for bar chart
# ---------------------------------------------
cmap   = plt.colormaps["tab10"]  # Modern API
colors = cmap((groups_ord - 1) % 10)

# ---------------------------------------------
# Plot
# ---------------------------------------------
fig, ax = plt.subplots(figsize=(13, 4), dpi=300)
ax.bar(range(len(R2_ord)), R2_ord, color=colors, edgecolor="black", lw=.15)

ax.set_ylabel("R²")
ax.set_ylim(0, 1)
ax.set_title(f"R²({r_opt}) ordered by group")

# Group separation and alternating bands
ticks, labels = [], []
start = 0
for g in range(1, 9):
    size = group_codes.count(g)
    if size == 0:
        continue
    end = start + size - 1

    # grey band for even groups
    if g % 2 == 0:
        ax.axvspan(start-0.5, end+0.5, color="lightgrey", alpha=.20)

    # separator line
    ax.axvline(end + 0.5, color="black", lw=1.6)

    ticks.append((start + end) / 2)
    labels.append(f"Group {g}")
    start += size

ax.set_xticks(ticks)
ax.set_xticklabels(labels, rotation=35, ha="right")
ax.set_xlim(-0.5, len(R2_ord) - 0.5)
plt.tight_layout()
plt.show()
[WARN] Series not found in R2_matrix:
['NAPMPI', 'NAPMEI', 'NAPM', 'NAPMNOI', 'NAPMSDI', 'NAPMII', 'AMBSL', 'MZMSL', 'TWEXMMTH', 'PPIFGS', 'PPIFCG', 'PPIITM', 'PPICRM', 'NAPMPRI', 'CUUR0000SAD', 'CUUR0000SA0L2', 'S&P: indust']
No description has been provided for this image

Forecast¶

Now the paths of the paper and the Professor diverge.

The professor asks:

  1. Using the best factor model, I would like to see the forecast for Industrial Production (the level, not the growth) and Inflation (the first difference of CPI) for the next six months.

  2. For these two series, I would also like to see an evaluation of the forecast using a rolling windows of data. If you feel like, you can do a real time evaluation, by using the same data a researcher would use if she were to run the model in the last period of the estimation window.

1. Six - Month Forecast¶

In [45]:
print("NaN in CPIAUCSL:", df["CPIAUCSL"].isna().sum())
print("NaN in INDPRO:", df["INDPRO"].isna().sum())
NaN in CPIAUCSL: 0
NaN in INDPRO: 0

Since the original series don't have NaN we can avoid re-contructing the series. We can use the original ones

In [46]:
Y_df_6m = df[["Date", "CPIAUCSL", "INDPRO"]].copy()
In [47]:
# Date as index
Y_df_6m["Date"] = pd.to_datetime(Y_df_6m["Date"])
Y_df_6m = Y_df_6m.set_index("Date")
In [48]:
print(Y_df_6m.head())
            CPIAUCSL   INDPRO
Date                         
1959-01-01     29.01  21.9616
1959-02-01     29.00  22.3917
1959-03-01     28.97  22.7142
1959-04-01     28.98  23.1981
1959-05-01     29.04  23.5476

Now let's compute the first difference fo CPIAUCSL and remove also first row

In [49]:
# Compute the first difference on CPIAUCSL
Y_df_6m["diff_CPIAUCSL"] = Y_df_6m["CPIAUCSL"].diff()

# Remove the first two rows with NaN (and keep other values consistent) and KEEP CONSISTENT WITH factors
Y_df_6m = Y_df_6m.iloc[2:]
Y_df_6m = Y_df_6m.drop(columns=["CPIAUCSL"])
In [50]:
print(Y_df_6m)
              INDPRO  diff_CPIAUCSL
Date                               
1959-03-01   22.7142         -0.030
1959-04-01   23.1981          0.010
1959-05-01   23.5476          0.060
1959-06-01   23.5744          0.070
1959-07-01   23.0099          0.040
...              ...            ...
2024-11-01  101.9619          0.885
2024-12-01  103.1177          1.154
2025-01-01  103.3418          1.483
2025-02-01  104.2202          0.689
2025-03-01  103.8892         -0.160

[793 rows x 2 columns]
In [51]:
print(factors.head())
                  F1        F2        F3        F4        F5        F6  \
Date                                                                     
1959-03-01 -0.634074 -0.154981  0.012307 -0.109969 -0.135474  0.063457   
1959-04-01 -0.672058 -0.087580 -0.039489  0.025380 -0.150690  0.210907   
1959-05-01 -0.434015 -0.089333  0.030766  0.024450 -0.151005  0.072027   
1959-06-01 -0.181955 -0.056294 -0.147395  0.112033 -0.112040  0.050307   
1959-07-01  0.189512 -0.170787 -0.216749  0.020062 -0.254072 -0.007770   

                  F7  
Date                  
1959-03-01  0.079084  
1959-04-01 -0.067066  
1959-05-01 -0.047864  
1959-06-01  0.011220  
1959-07-01  0.139868  

As in the paper we use the BIC to choose best lag both for dependent variable Y and iindependent variables X's

In [52]:
# Create lags of the variables
def create_lagged_matrix(series: pd.DataFrame, max_lag: int, prefix: str = None) -> pd.DataFrame:
    lagged_data = {}
    for col in series.columns:
        for lag in range(1, max_lag + 1):
            col_name = f"{prefix}_{col}_lag{lag}" if prefix else f"{col}_lag{lag}"
            lagged_data[col_name] = series[col].shift(lag)
    return pd.DataFrame(lagged_data, index=series.index)


# Evaluate all lag combinations for Y and factors
def evaluate_lags(Y_df: pd.DataFrame, target_col: str,
                  factors: pd.DataFrame,
                  max_lag_y: int = 6,
                  max_lag_f: int = 6) -> pd.DataFrame:

    results = []

    for p, m in product(range(max_lag_y + 1), range(1, max_lag_f + 1)):
        y = Y_df[[target_col]].copy()
        y.columns = ['Y']  # target at t+1

        # Lags of Y (from 1 to p)
        if p > 0:
            y_lags = create_lagged_matrix(Y_df[[target_col]], p, prefix='Y')
        else:
            y_lags = pd.DataFrame(index=Y_df.index)

        # Lags of the factors (from 1 to m)
        f_lags = create_lagged_matrix(factors, m, prefix='F')

        # Build final dataset
        X = pd.concat([y_lags, f_lags], axis=1)
        Y_target = y.shift(-1)  # target at t+1
        data = pd.concat([Y_target, X], axis=1).dropna()

        if data.empty:
            continue

        y_final = data['Y']
        X_final = sm.add_constant(data.drop(columns='Y'))

        model = sm.OLS(y_final, X_final).fit()
        results.append({
            'lag_Y': p,
            'lag_F': m,
            'AIC': model.aic,
            'BIC': model.bic,
            'R2': model.rsquared
        })

    return pd.DataFrame(results).sort_values(by='BIC').reset_index(drop=True)

INDPRO¶

In [53]:
results_IND = evaluate_lags(Y_df_6m, target_col="INDPRO", factors=factors,
                        max_lag_y=6, max_lag_f=6)
In [54]:
bic_6m_IND = results_IND.sort_values(by='BIC').head(5)
print("MIGLIORI MODELLI SECONDO BIC")
print(bic_6m_IND[['lag_Y', 'lag_F', 'BIC', 'R2']].to_string(index=False))
MIGLIORI MODELLI SECONDO BIC
 lag_Y  lag_F         BIC       R2
     3      1 2375.467375 0.998480
     6      1 2375.729966 0.998489
     1      1 2378.052052 0.998467
     4      1 2378.771488 0.998477
     2      1 2379.500400 0.998468

diff_CPIAUCSL¶

In [55]:
results_CPI = evaluate_lags(Y_df_6m, target_col="diff_CPIAUCSL", factors=factors,
                        max_lag_y=6, max_lag_f=6)
In [56]:
bic_6m_CPI = results_CPI.sort_values(by='BIC').head(5)
print("MIGLIORI MODELLI SECONDO BIC")
print(bic_6m_CPI[['lag_Y', 'lag_F', 'BIC', 'R2']].to_string(index=False))
MIGLIORI MODELLI SECONDO BIC
 lag_Y  lag_F         BIC       R2
     6      1 1083.646067 0.178863
     5      1 1097.017813 0.158363
     4      1 1098.018519 0.150947
     3      1 1099.174212 0.143283
     1      1 1112.052241 0.115923

Forecast INDPRO¶

Let's create the optimal number of lags choosen

In [57]:
# Create lags of INDPRO up to the third lag
indpro_lags = create_lagged_matrix(Y_df_6m[["INDPRO"]], max_lag=3)
indpro_lags.dropna(inplace=True)

# Create lags of the factors up to the first lag
factors_lags_IND = create_lagged_matrix(factors, max_lag=1)
factors_lags_IND.dropna(inplace=True)
In [58]:
final_IND = indpro_lags.join(factors_lags_IND, how='inner')
In [59]:
final_IND = Y_df_6m['INDPRO'].to_frame().join(final_IND, how='inner')
In [60]:
print(final_IND)
              INDPRO  INDPRO_lag1  INDPRO_lag2  INDPRO_lag3   F1_lag1  \
Date                                                                    
1959-06-01   23.5744      23.5476      23.1981      22.7142 -0.434015   
1959-07-01   23.0099      23.5744      23.5476      23.1981 -0.181955   
1959-08-01   22.2304      23.0099      23.5744      23.5476  0.189512   
1959-09-01   22.2035      22.2304      23.0099      23.5744  0.765563   
1959-10-01   22.0422      22.2035      22.2304      23.0099 -0.010836   
...              ...          ...          ...          ...       ...   
2024-11-01  101.9619     102.2138     102.5954     103.0196  0.222903   
2024-12-01  103.1177     101.9619     102.2138     102.5954  0.026669   
2025-01-01  103.3418     103.1177     101.9619     102.2138 -0.141360   
2025-02-01  104.2202     103.3418     103.1177     101.9619 -0.035340   
2025-03-01  103.8892     104.2202     103.3418     103.1177 -0.009237   

             F2_lag1   F3_lag1   F4_lag1   F5_lag1   F6_lag1   F7_lag1  
Date                                                                    
1959-06-01 -0.089333  0.030766  0.024450 -0.151005  0.072027 -0.047864  
1959-07-01 -0.056294 -0.147395  0.112033 -0.112040  0.050307  0.011220  
1959-08-01 -0.170787 -0.216749  0.020062 -0.254072 -0.007770  0.139868  
1959-09-01 -0.100290 -0.384532  0.009054 -0.433862 -0.171845  0.182504  
1959-10-01  0.072548 -0.226914  0.337323 -0.243761 -0.209014 -0.143375  
...              ...       ...       ...       ...       ...       ...  
2024-11-01  0.002868 -0.166846  0.160544  0.089281 -0.062023  0.089715  
2024-12-01 -0.018730 -0.052244  0.147181  0.083626 -0.002342  0.043366  
2025-01-01  0.083604 -0.058928 -0.003179  0.116681 -0.048115 -0.035268  
2025-02-01 -0.002955 -0.125281  0.149342 -0.003125  0.139905  0.015350  
2025-03-01 -0.160394  0.104215 -0.116207  0.098661  0.023124 -0.008177  

[790 rows x 11 columns]

OLS¶

In [61]:
# Extract Y and X
Y_IND = final_IND.iloc[:, 0].values.reshape(-1, 1)     # INDPRO
X_IND_raw = final_IND.iloc[:, 1:]                      # All others as regressors

# Add constant (column of 1s)
X_IND = np.hstack([np.ones((X_IND_raw.shape[0], 1)), X_IND_raw.values])
var_names_IND = ['const'] + list(X_IND_raw.columns)

# OLS estimation: β = (X'X)^(-1) X'Y
beta_IND = np.linalg.inv(X_IND.T @ X_IND) @ (X_IND.T @ Y_IND)

# Predictions
Y_hat_IND = X_IND @ beta_IND

# Residuals
residuals_IND = Y_IND - Y_hat_IND

# R²
ss_total_IND = np.sum((Y_IND - np.mean(Y_IND)) ** 2)
ss_resid_IND = np.sum(residuals_IND ** 2)
r_squared_IND = 1 - (ss_resid_IND / ss_total_IND)

# Output with variable names
coef_df_IND = pd.DataFrame({
    'Variable': var_names_IND,
    'Beta': beta_IND.flatten()
})
In [62]:
print("Coefficienti stimati (beta_IND):")
print(coef_df_IND.to_string(index=False))
print(f"\nR²_IND: {r_squared_IND:.6f}")
Coefficienti stimati (beta_IND):
   Variable      Beta
      const  0.262131
INDPRO_lag1  1.032763
INDPRO_lag2 -0.178734
INDPRO_lag3  0.143785
    F1_lag1 -0.604878
    F2_lag1  0.315490
    F3_lag1 -0.002732
    F4_lag1  0.194542
    F5_lag1 -0.378690
    F6_lag1 -0.071213
    F7_lag1  1.246587

R²_IND: 0.999420

Assumption: Persistence of Factors¶

We assume that the values of the factors remain constant over the forecast horizon.
In other words, the lagged values of $F1–F7$ at time $T$ are reused for all future periods $T+1$ to $T+6$.

  • Fast and simple: no need to estimate additional models.
  • Not professional for extremely accuracy forecast
In [63]:
p_IND = 3         # lag of INDPRO
m_IND = 1         # lag of factors

pred_cols_IND = (
    [f"INDPRO_lag{i}" for i in range(1, p_IND + 1)] +
    [f"F{i}_lag{m_IND}" for i in range(1, 8)]
)

# 2) h-STEP FORECAST FUNCTION FOR INDPRO

def forecast_INDPRO(df_ind: pd.DataFrame,
                    beta_vec: np.ndarray,
                    h: int = 6) -> pd.Series:
    
    preds, dates = [], []

    last_obs  = df_ind.iloc[-1].copy()          # last observation
    last_date = pd.to_datetime(df_ind.index[-1])
    const     = 1.0

    for step in range(1, h + 1):
        X_in   = np.concatenate([[const], last_obs[pred_cols_IND].values])
        y_pred = float(X_in @ beta_vec)         # forecast INDPRO
        preds.append(y_pred)
        dates.append(last_date + pd.offsets.MonthBegin(step))

        # update the 3 lags of INDPRO
        for lag in range(p_IND, 1, -1):
            last_obs[f"INDPRO_lag{lag}"] = last_obs[f"INDPRO_lag{lag-1}"]
        last_obs["INDPRO_lag1"] = last_obs["INDPRO"]
        last_obs["INDPRO"]      = y_pred           # new value at t

        # fixed factors (persistence)
        # Fi_lag1 remain unchanged → no update

    return pd.Series(preds, index=dates, name="INDPRO_forecast")


# 3) DATAFRAME WITH REQUIRED COLUMNS FOR FORECAST
final_IND_sel = final_IND[["INDPRO"] + pred_cols_IND]

# 4) 6-MONTH FORECAST
forecast_6m_IND = forecast_INDPRO(
    final_IND_sel,
    beta_IND.flatten(),        # vector (k+1,)
    h=6
)
In [64]:
print("\nINDPRO forecasts for the next 6 months:")
print(forecast_6m_IND.round(3))
INDPRO forecasts for the next 6 months:
2025-04-01    104.136
2025-05-01    103.669
2025-06-01    104.109
2025-07-01    103.536
2025-08-01    104.109
2025-09-01    103.371
Name: INDPRO_forecast, dtype: float64
In [65]:
# 1) Last 12 months of observed INDPRO

actual_last12_IND = final_IND["INDPRO"].iloc[-12:]

# 2) Forecast series for 6 months
fcst_IND = forecast_6m_IND

# 3) Combine observed + forecast (long format for plotting)

plot_series_IND = (
    pd.concat(
        {"INDPRO (observed)": actual_last12_IND,
         "INDPRO (forecast)":  fcst_IND}
    )
    .to_frame(name="value")
    .reset_index()
    .rename(columns={"level_0": "Series",
                     "level_1": "Date"})
)

# 4) Plot with "bridge" between last observed and first forecast

fig, ax = plt.subplots(figsize=(8, 4), dpi=500)

forecast_color = None  # will be stored on the fly

for lbl, grp in plot_series_IND.groupby("Series"):
    line, = ax.plot(grp["Date"], grp["value"],
                    marker="" if "forecast" in lbl else None,
                    linestyle="-",
                    label=lbl)
    if "forecast" in lbl:            # store the forecast line color
        forecast_color = line.get_color()

# — bridge segment in same color as forecast —
last_obs_date   = actual_last12_IND.index[-1]
last_obs_value  = actual_last12_IND.iloc[-1]
first_fcst_date = fcst_IND.index[0]
first_fcst_val  = fcst_IND.iloc[0]

ax.plot([last_obs_date, first_fcst_date],
        [last_obs_value, first_fcst_val],
        color=forecast_color, linestyle="-", linewidth=1.2)

# — plot details —
ax.set_title("INDPRO: last 12 months observed + 6-month forecast")
ax.set_ylabel("Level (index 2017=100)")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
No description has been provided for this image

Forecast CPI¶

In [66]:
cpi_lags = create_lagged_matrix(Y_df_6m[["diff_CPIAUCSL"]], max_lag=6)  # diff_CPIAUCSL_lag1 … lag6
cpi_lags.dropna(inplace=True)

factors_lags_CPI = create_lagged_matrix(factors, max_lag=1)             # F1_lag1 … F7_lag1
factors_lags_CPI.dropna(inplace=True)

final_CPI = cpi_lags.join(factors_lags_CPI, how="inner")
final_CPI = Y_df_6m["diff_CPIAUCSL"].to_frame().join(final_CPI, how="inner")
In [67]:
final_CPI.columns = [
    col.replace("diff_CPIAUCSL", "diff_CPIAUCSL_CPI")       
    if col.startswith("diff_CPIAUCSL") else f"{col}_CPI"    
    for col in final_CPI.columns
]
In [68]:
print(final_CPI.head())
            diff_CPIAUCSL_CPI  diff_CPIAUCSL_CPI_lag1  diff_CPIAUCSL_CPI_lag2  \
Date                                                                            
1959-09-01               0.07                    0.03                    0.04   
1959-10-01               0.10                    0.07                    0.03   
1959-11-01               0.00                    0.10                    0.07   
1959-12-01               0.06                    0.00                    0.10   
1960-01-01              -0.04                    0.06                    0.00   

            diff_CPIAUCSL_CPI_lag3  diff_CPIAUCSL_CPI_lag4  \
Date                                                         
1959-09-01                    0.07                    0.06   
1959-10-01                    0.04                    0.07   
1959-11-01                    0.03                    0.04   
1959-12-01                    0.07                    0.03   
1960-01-01                    0.10                    0.07   

            diff_CPIAUCSL_CPI_lag5  diff_CPIAUCSL_CPI_lag6  F1_lag1_CPI  \
Date                                                                      
1959-09-01                    0.01                   -0.03     0.765563   
1959-10-01                    0.06                    0.01    -0.010836   
1959-11-01                    0.07                    0.06     0.317555   
1959-12-01                    0.04                    0.07     0.020195   
1960-01-01                    0.03                    0.04    -1.387339   

            F2_lag1_CPI  F3_lag1_CPI  F4_lag1_CPI  F5_lag1_CPI  F6_lag1_CPI  \
Date                                                                          
1959-09-01    -0.100290    -0.384532     0.009054    -0.433862    -0.171845   
1959-10-01     0.072548    -0.226914     0.337323    -0.243761    -0.209014   
1959-11-01    -0.215715    -0.095061    -0.017248    -0.114601    -0.077012   
1959-12-01    -0.164277    -0.029766     0.019729    -0.067122     0.054608   
1960-01-01     0.008380     0.257560     0.035697     0.034781     0.019750   

            F7_lag1_CPI  
Date                     
1959-09-01     0.182504  
1959-10-01    -0.143375  
1959-11-01     0.006941  
1959-12-01    -0.179049  
1960-01-01    -0.325786  
In [69]:
Y_CPI = final_CPI.iloc[:, 0].values.reshape(-1, 1)      # first column = target
X_CPI_raw = final_CPI.iloc[:, 1:]                       # all others = regressors

X_CPI = np.hstack([np.ones((X_CPI_raw.shape[0], 1)), X_CPI_raw.values])
var_names_CPI = ["const"] + list(X_CPI_raw.columns)

beta_CPI = np.linalg.inv(X_CPI.T @ X_CPI) @ (X_CPI.T @ Y_CPI)

Y_hat_CPI = X_CPI @ beta_CPI
residuals_CPI = Y_CPI - Y_hat_CPI

ss_tot_CPI  = np.sum((Y_CPI - np.mean(Y_CPI)) ** 2)
ss_res_CPI  = np.sum(residuals_CPI ** 2)
r_squared_CPI = 1 - ss_res_CPI / ss_tot_CPI

coef_df_CPI = pd.DataFrame({
    "Variable": var_names_CPI,
    "Beta": beta_CPI.flatten()
})
In [70]:
print("\nCoefficienti stimati (beta_CPI):")
print(coef_df_CPI.to_string(index=False))
print(f"\nR²_CPI: {r_squared_CPI:.6f}")
Coefficienti stimati (beta_CPI):
              Variable      Beta
                 const  0.127399
diff_CPIAUCSL_CPI_lag1  0.420906
diff_CPIAUCSL_CPI_lag2 -0.052911
diff_CPIAUCSL_CPI_lag3  0.100514
diff_CPIAUCSL_CPI_lag4  0.099478
diff_CPIAUCSL_CPI_lag5  0.009028
diff_CPIAUCSL_CPI_lag6  0.080149
           F1_lag1_CPI -0.101769
           F2_lag1_CPI  0.251778
           F3_lag1_CPI -0.178585
           F4_lag1_CPI  0.053161
           F5_lag1_CPI  0.046578
           F6_lag1_CPI -0.158697
           F7_lag1_CPI  0.584645

R²_CPI: 0.411454
In [71]:
# 1.   LAGS TO USE (the same chosen using BIC)
#      p_CPI = number of lags for CPI         m_CPI = number of factor lags

p_CPI = 6
m_CPI = 1

pred_cols_CPI = (
    [f"diff_CPIAUCSL_CPI_lag{i}" for i in range(1, p_CPI + 1)] +
    [f"F{i}_lag{m_CPI}_CPI"       for i in range(1, 8)]
)

# 2.

print("β_CPI available?  shape:", beta_CPI.shape)

# 3.  ITERATIVE h-STEP FORECAST FUNCTION FOR CPI

def forecast_diffCPI(df_CPI: pd.DataFrame,
                     beta_vec: np.ndarray,
                     h: int = 6) -> pd.Series:

    preds, dates = [], []
    last_obs  = df_CPI.iloc[-1].copy()          # last available row
    last_date = pd.to_datetime(df_CPI.index[-1])
    const     = 1.0

    for step in range(1, h + 1):
        X_in   = np.concatenate([[const], last_obs[pred_cols_CPI].values])
        y_pred = float(X_in @ beta_vec)         # forecast ΔCPI
        preds.append(y_pred)
        dates.append(last_date + pd.offsets.MonthBegin(step))

        # ---- update the 6 lags of the variable -------------------------
        for lag in range(p_CPI, 1, -1):
            last_obs[f"diff_CPIAUCSL_CPI_lag{lag}"] = \
                last_obs[f"diff_CPIAUCSL_CPI_lag{lag-1}"]
        last_obs["diff_CPIAUCSL_CPI_lag1"] = last_obs["diff_CPIAUCSL_CPI"]
        last_obs["diff_CPIAUCSL_CPI"]      = y_pred   # new value at t

        # ---- factors: assume constant factors --------------------------
        # (F?_lag1_CPI remain unchanged → no update)

    return pd.Series(preds, index=dates, name="diff_CPIAUCSL_forecast")


# 4.  BUILD THE DATAFRAME WITH REQUIRED REGRESSORS

final_CPI_sel = final_CPI[["diff_CPIAUCSL_CPI"] + pred_cols_CPI]

# 5.  6-MONTH FORECAST

forecast_6m_CPI = forecast_diffCPI(
    final_CPI_sel,
    beta_CPI.flatten(),     # vector (k+1,)
    h=6
)
β_CPI available?  shape: (14, 1)
In [72]:
print("\nCPI forecasts for the next 6 months:")
print(forecast_6m_CPI.round(3))
CPI forecasts for the next 6 months:
2025-04-01    0.539
2025-05-01    0.285
2025-06-01    0.593
2025-07-01    0.309
2025-08-01    0.457
2025-09-01    0.294
Name: diff_CPIAUCSL_forecast, dtype: float64
In [73]:
# 1) last 12 months of observed data
actual_last12_CPI = final_CPI["diff_CPIAUCSL_CPI"].iloc[-12:]

# 2) 6-month forecast
fcst_CPI = forecast_6m_CPI

# 3) concatenate for plotting (unchanged structure)
plot_series_CPI = (
    pd.concat(
        {"diff CPI (observed)": actual_last12_CPI,
         "diff CPI (forecast)":  fcst_CPI}
    )
    .to_frame(name="value")
    .reset_index()
    .rename(columns={"level_0": "Series",
                     "level_1": "Date"})
)

fig, ax = plt.subplots(figsize=(8, 4), dpi=600)

forecast_color = None             ### init

for lbl, grp in plot_series_CPI.groupby("Series"):
    line, = ax.plot(grp["Date"], grp["value"],
                    marker="" if "forecast" in lbl else None,
                    linestyle="-",
                    label=lbl)
    if "forecast" in lbl:          ### store forecast line color
        forecast_color = line.get_color()

# bridge segment with correct color
last_obs_date   = actual_last12_CPI.index[-1]
last_obs_value  = actual_last12_CPI.iloc[-1]
first_fcst_date = fcst_CPI.index[0]
first_fcst_val  = fcst_CPI.iloc[0]

ax.plot([last_obs_date, first_fcst_date],
        [last_obs_value, first_fcst_val],
        color=forecast_color, linestyle="-", linewidth=1.2)

#  plot details
ax.set_title("diff CPIAUCSL: last 12 months observed + 6-month forecast")
ax.set_ylabel("First-difference CPI (pp)")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
No description has been provided for this image

2. Real Time Evaluation¶

The dataset is split into 80% as train data and 20% as test data

We act as if we knew data up to t, we estimate the factors and use their lags, in this way we simulate the reality.

E.G. Today we can use just the data we know $\rightarrow$ no future informations

So each 'month' we estimate the model and then make a forecast about next month. When the real info is available we include it in our data and estimate the model again to predict next month observation and so on.

NOTE:

For this real time evaluation we are going to skip the outliers procedure

The model will be re-estimated each month, i.e. each observation

In [74]:
# To work directly with df
df["Date"] = pd.to_datetime(df["Date"])
df = df.set_index("Date")

Loop for Real Time Evaluation¶

Let's create a silent version of the em_factors function such that we are not gonna have a confusing output

In [75]:
def em_factors_silent(
    df_transformed:   pd.DataFrame,      # NaN “veri”
    df_missing:       pd.DataFrame,      # NaN → μ
    df_standardized:  pd.DataFrame,      # z-score
    *,
    kmax:  int = 15,
    tol:   float = 1e-6,
    max_iter: int = 200,
    print_every: int | None = 0          # 0/None → nessun log
):
    idx, cols = df_transformed.index, df_transformed.columns
    mask_nan  = df_transformed.isna().to_numpy()

    X_std      = df_standardized.to_numpy(float)
    X_hat_prev = np.zeros_like(X_std)
    err, it    = np.inf, 0

    while err > tol and it < max_iter:
        it += 1
        r_star = pc_p2_criterion(X_std, kmax)
        F_hat, Λ_hat, X_hat, _ = pca_stock_watson(X_std, r_star)

        if it > 1:
            err = ((X_hat - X_hat_prev)**2).sum() / (X_hat_prev**2).sum()
        X_hat_prev = X_hat

        if print_every and (it == 1 or it % print_every == 0):
            print(f"Iter {it:3d} | r*={r_star:2d} | err={err:9.2e}")

        μ, σ = X_std.mean(0), X_std.std(0, ddof=0)
        σ[σ == 0] = 1.0
        X_unstd = X_std*σ + μ
        X_unstd[mask_nan] = (X_hat*σ + μ)[mask_nan]

        μn, σn = X_unstd.mean(0), X_unstd.std(0, ddof=0)
        σn[σn == 0] = 1.0
        X_std = (X_unstd - μn) / σn

    return (pd.DataFrame(F_hat, idx, [f"F{i+1}" for i in range(r_star)]),
            pd.DataFrame(Λ_hat, cols, [f"F{i+1}" for i in range(r_star)]),
            r_star,
            pd.DataFrame(X_unstd, idx, cols),
            pd.DataFrame(X_std,    idx, cols),
            pd.DataFrame(X_hat,    idx, cols))

INDPRO (Level)¶

  • Lags of INDPRO = $3$

  • Lags of factors = $1$

In [76]:
# PARAMETERS (same as your original block)
p_IND        = 3          # lags of INDPRO
m_IND        = 1          # lags of factors (can be 1, 2, 3, ...)
kmax_em      = 15
tol_em       = 1e-6
maxiter_em   = 200
progress_mod = 16         # print ETA every 16 iterations

# DATA SPLIT
df_train = df_transformed.loc[:'2011-11-01'].copy()
df_test  = df_transformed.loc['2011-12-01':].copy()

fcst_dates, fcst_values = [], []
tic, steps_total, step_count = time.time(), len(df_test), 0

# ROLLING-WINDOW LOOP
while not df_test.empty:

    # 1) fill NaNs + standardize
    df_nan = df_train.fillna(df_train.mean())
    df_std = (df_nan - df_nan.mean()) / df_nan.std()

    # 2) extract factors via EM (silent)
    factors_train, _, r_opt, *_ = em_factors_silent(
        df_transformed  = df_train,
        df_missing      = df_nan,
        df_standardized = df_std,
        kmax            = kmax_em,
        tol             = tol_em,
        max_iter        = maxiter_em,
        print_every     = 0
    )

    # 3) regressors = lags 1…m_IND of factors + lags 1…p_IND of INDPRO
    lag_F = [factors_train.shift(l).add_suffix(f"_lag{l}")
             for l in range(1, m_IND + 1)]
    lag_Y = [df.loc[:, "INDPRO"].shift(l).to_frame(f"INDPRO_lag{l}")
             for l in range(1, p_IND + 1)]
    X_train = pd.concat(lag_F + lag_Y, axis=1).dropna()

    # 4) target Y
    Y_train = df.loc[X_train.index, "INDPRO"]

    # 5) OLS
    Xc   = np.hstack([np.ones((len(X_train), 1)), X_train.values])
    beta = np.linalg.inv(Xc.T @ Xc) @ (Xc.T @ Y_train.values.reshape(-1, 1))

    # 6) regressor vector for t+1
    #    • factor blocks: F(t), F(t-1)…F(t-(m_IND-1))
    F_blocks = [factors_train.shift(l).iloc[-1].values
                for l in range(0, m_IND)]          # 0 = current
    #    • INDPRO scalars: lag 1…p_IND
    Y_scalars = [df.loc[df_train.index[-1-l], "INDPRO"]
                 for l in range(p_IND)]

    x_new = np.hstack([1.0, *F_blocks, *Y_scalars])
    y_hat = float((x_new @ beta).squeeze())        # no warnings

    # 7) save forecast
    next_date = df_test.index[0]
    fcst_dates.append(next_date)
    fcst_values.append(y_hat)

    # 8) roll-forward
    df_train = pd.concat([df_train, df_test.iloc[[0]]])
    df_test  = df_test.iloc[1:]

    # 9) progress bar
    step_count += 1
    if step_count % progress_mod == 0 or step_count == steps_total:
        elapsed = time.time() - tic
        eta_sec = (elapsed / step_count) * (steps_total - step_count)
        eta_str = time.strftime('%H:%M:%S', time.gmtime(eta_sec))
        print(f"[{step_count:>3}/{steps_total}]  r*={r_opt:2d}  ETA ≈ {eta_str}")

# OUT-OF-SAMPLE FORECAST SERIES
forecast_INDPRO_oos = pd.Series(fcst_values,
                                index=pd.to_datetime(fcst_dates),
                                name="INDPRO_forecast")
[ 16/160]  r*= 7  ETA ≈ 00:02:40
[ 32/160]  r*= 7  ETA ≈ 00:02:16
[ 48/160]  r*= 7  ETA ≈ 00:02:00
[ 64/160]  r*= 7  ETA ≈ 00:01:44
[ 80/160]  r*= 7  ETA ≈ 00:01:33
[ 96/160]  r*= 7  ETA ≈ 00:01:16
[112/160]  r*= 7  ETA ≈ 00:01:07
[128/160]  r*= 7  ETA ≈ 00:00:53
[144/160]  r*= 7  ETA ≈ 00:00:29
[160/160]  r*= 7  ETA ≈ 00:00:00
In [77]:
# 1. Extract the actual INDPRO series corresponding to the OOS forecast dates
y_true = df.loc[forecast_INDPRO_oos.index, "INDPRO"].astype(float)

# 2. Forecast series (already in correct order and with same index)
y_pred = forecast_INDPRO_oos.astype(float)

# 3. MSE and RMSE
mse  = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)

print(f"MSE  : {mse:.4f}")
print(f"RMSE : {rmse:.4f}")
MSE  : 1.8641
RMSE : 1.3653
In [78]:
# Actual series and one-step-ahead forecasts
actual_IND  = y_true               # observed out-of-sample values
pred_IND    = forecast_INDPRO_oos.reindex(actual_IND.index)   # align on the same dates

# Plot
fig, ax = plt.subplots(figsize=(10,4), dpi=500)

ax.plot(actual_IND.index, actual_IND.values,
        label="INDPRO (observed)", color="tab:blue")
ax.plot(pred_IND.index,   pred_IND.values,
        label="INDPRO (1-step forecast)", color="tab:orange", linestyle="-")

ax.set_title("INDPRO – one-step-ahead rolling forecasts vs. actual")
ax.set_ylabel("Index level")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
No description has been provided for this image

CPIAUSCL (First Difference)¶

  • Lags of CPIAUSCL = $6$

  • Lags of factors = $1$

In [79]:
# TARGET VARIABLE (Δ CPI)
df_diff = df.copy()                       # df = levels, already in workspace
df_diff["diff_CPIAUCSL"] = df["CPIAUCSL"].diff()
df_diff = df_diff.iloc[1:]                # drop first NaN row

# align indices: keep only common dates
common_idx      = df_transformed.index.intersection(df_diff.index)
df_diff         = df_diff.loc[common_idx].copy()
df_transf_diff  = df_transformed.loc[common_idx].copy()   # now same dates

# PARAMETERS
p_CPI        = 6          # lags of ΔCPI to use as regressors
m_CPI        = 1          # lags of factors
kmax_em      = 15
tol_em       = 1e-6
maxiter_em   = 200
progress_mod = 16

# TRAIN / TEST SPLIT
df_train = df_transf_diff.loc[:'2011-11-01'].copy()
df_test  = df_transf_diff.loc['2011-12-01':].copy()

fcst_dates, fcst_values = [], []
tic, steps_total, step_count = time.time(), len(df_test), 0

# ROLLING-WINDOW ONE-STEP-AHEAD – ΔCPI
while not df_test.empty:

    # 1) NaNs → mean  +  standardize
    df_nan = df_train.fillna(df_train.mean())
    df_std = (df_nan - df_nan.mean()) / df_nan.std()

    # 2) extract factors via EM
    F_train, _, r_opt, *_ = em_factors_silent(
        df_transformed  = df_train,
        df_missing      = df_nan,
        df_standardized = df_std,
        kmax            = kmax_em,
        tol             = tol_em,
        max_iter        = maxiter_em,
        print_every     = 0
    )

    # 3) X = lag(1…m_CPI) of factors + lag(1…p_CPI) of ΔCPI
    lag_F = [F_train.shift(l).add_suffix(f"_lag{l}")
             for l in range(1, m_CPI + 1)]
    lag_Y = [df_diff["diff_CPIAUCSL"].shift(l).to_frame(f"diff_CPI_lag{l}")
             for l in range(1, p_CPI + 1)]
    X_train = pd.concat(lag_F + lag_Y, axis=1).dropna()

    # 4) target Y = aligned ΔCPI
    Y_train = df_diff.loc[X_train.index, "diff_CPIAUCSL"]

    # 5) OLS
    Xc   = np.hstack([np.ones((len(X_train), 1)), X_train.values])
    beta = np.linalg.inv(Xc.T @ Xc) @ (Xc.T @ Y_train.values.reshape(-1, 1))

    # 6) regressor vector for t+1
    F_blocks  = [F_train.shift(l).iloc[-1].values for l in range(0, m_CPI)]
    Y_scalars = [df_diff.loc[df_train.index[-1-l], "diff_CPIAUCSL"]
                 for l in range(p_CPI)]
    x_new = np.hstack([1.0, *F_blocks, *Y_scalars])
    y_hat = float((x_new @ beta).squeeze())      # forecast ΔCPI_{t+1}

    # 7) save
    fcst_dates.append(df_test.index[0])
    fcst_values.append(y_hat)

    # 8) roll-forward
    df_train = pd.concat([df_train, df_test.iloc[[0]]])
    df_test  = df_test.iloc[1:]

    # 9) progress bar
    step_count += 1
    if step_count % progress_mod == 0 or step_count == steps_total:
        elapsed = time.time() - tic
        eta = time.strftime('%H:%M:%S',
                            time.gmtime((elapsed/step_count)*(steps_total-step_count)))
        print(f"[{step_count:>3}/{steps_total}]  r*={r_opt:2d}  ETA ≈ {eta}")

# OOS FORECAST SERIES (ΔCPI)
forecast_CPI_oos = pd.Series(fcst_values,
                             index=pd.to_datetime(fcst_dates),
                             name="diff_CPIAUCSL_forecast")
[ 16/160]  r*= 7  ETA ≈ 00:03:50
[ 32/160]  r*= 7  ETA ≈ 00:02:56
[ 48/160]  r*= 7  ETA ≈ 00:02:36
[ 64/160]  r*= 7  ETA ≈ 00:02:15
[ 80/160]  r*= 7  ETA ≈ 00:01:52
[ 96/160]  r*= 7  ETA ≈ 00:01:29
[112/160]  r*= 7  ETA ≈ 00:01:16
[128/160]  r*= 7  ETA ≈ 00:00:57
[144/160]  r*= 7  ETA ≈ 00:00:31
[160/160]  r*= 7  ETA ≈ 00:00:00
In [80]:
# 1. Extract true ΔCPI values for forecast dates
y_true_CPI = df_diff.loc[forecast_CPI_oos.index,
                         "diff_CPIAUCSL"].astype(float)

# 2. Forecast series (already aligned)
y_pred_CPI = forecast_CPI_oos.astype(float)

# 3. MSE / RMSE
mse_CPI  = np.mean((y_true_CPI - y_pred_CPI) ** 2)
rmse_CPI = np.sqrt(mse_CPI)

print(f"MSE  (CPI) : {mse_CPI:.6f}")
print(f"RMSE (CPI) : {rmse_CPI:.6f}")
MSE  (CPI) : 0.360018
RMSE (CPI) : 0.600015
In [81]:
# actual and predicted series
actual_CPI = y_true_CPI                      # observed OOS values
pred_CPI   = y_pred_CPI.reindex(actual_CPI.index)   # already same indices

fig, ax = plt.subplots(figsize=(10, 4), dpi=500)

ax.plot(actual_CPI.index, actual_CPI.values,
        label="CPI (observed)", color="tab:blue")
ax.plot(pred_CPI.index,   pred_CPI.values,
        label="CPI (1-step forecast)", color="tab:orange")

ax.set_title("First Difference of CPI – one-step-ahead rolling forecasts vs. actual")
ax.set_ylabel("First difference CPI (pp)")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
No description has been provided for this image